I play a lot of chess. Like … a lot of chess. From July 2021-January of 2022 I played over 1600 games on Lichess. For this project I’m analyzing all of my games played on Lichess using the chessR package by Jason Zivkovic. This package is an R wrapper for the Lichess API, making it extremely easy to import data from my games into the R environment.
# devtools::install_github("JaseZiv/chessR")
library(chessR)
library(dplyr)
library(ggplot2)
library(ggridges)
library(plotly)
library(lubridate)
Now I’ll retrieve all of my game data from Lichess
lichess_data <- get_raw_lichess('Eldiel_Prime')
## Extracting Eldiel_Prime games. Please wait
I’ll start by using the chessR helper functions num_moves() and get_winner() to determine who won each game, and how many moves were played in each game. Then I’ll perform my own analysis to determine stats like my elo and the opponents elo of each game, which color I played as, determining my result, and converting to PST date-time.
#helper functions from chessR
lichess_data$num_moves <- return_num_moves(moves_string = lichess_data$Moves)
lichess_data$winner <- get_winner(result_column = lichess_data$Result,
white = lichess_data$White,
black = lichess_data$Black)
#change dates to date class
lichess_data$Date <- ymd(lichess_data$Date)
#day of week
lichess_data$day_of_week <- wday(lichess_data$Date, label = TRUE)
#determine which color I played as
lichess_data$my_color <- NA
lichess_data$my_color[lichess_data$White == "eldiel_prime"] <- "White"
lichess_data$my_color[lichess_data$Black == "eldiel_prime"] <- "Black"
#determine my elo and opponent's elo
lichess_data$my_elo <- NA
lichess_data$opp_elo <- NA
lichess_data$my_elo[lichess_data$White == "eldiel_prime"] <- lichess_data$WhiteElo[lichess_data$White == "eldiel_prime"]
lichess_data$my_elo[lichess_data$Black == "eldiel_prime"] <- lichess_data$BlackElo[lichess_data$Black == "eldiel_prime"]
lichess_data$opp_elo[lichess_data$White == "eldiel_prime"] <- lichess_data$BlackElo[lichess_data$White == "eldiel_prime"]
lichess_data$opp_elo[lichess_data$Black == "eldiel_prime"] <- lichess_data$WhiteElo[lichess_data$Black == "eldiel_prime"]
#rename time controls
lichess_data$TimeControl[lichess_data$TimeControl == '300+0'] <- '5+0'
lichess_data$TimeControl[lichess_data$TimeControl == '600+0'] <- '10+0'
lichess_data$TimeControl[lichess_data$TimeControl == '60+0'] <- '1+0'
lichess_data$TimeControl[lichess_data$TimeControl == '180+2'] <- '3+2'
lichess_data$TimeControl[lichess_data$TimeControl == '120+1'] <- '2+1'
lichess_data$TimeControl[lichess_data$TimeControl == '180+0'] <- '3+0'
#determine my result, win loss or draw
lichess_data$my_result <- 'loss'
lichess_data$my_result[lichess_data$winner == 'eldiel_prime'] <- 'win'
lichess_data$my_result[lichess_data$winner == 'Draw'] <- 'draw'
#make factor to assign custom ordering for later use
lichess_data$my_result <- factor(lichess_data$my_result,
levels = c('loss','draw','win'))
#drop rows where opp elo is '?'
lichess_data <- lichess_data[lichess_data$opp_elo != '?',]
#ensure elos are numeric
lichess_data$opp_elo <- as.numeric(lichess_data$opp_elo)
lichess_data$my_elo <- as.numeric(lichess_data$my_elo)
#generate opening counts by color
lichess_data <- lichess_data %>%
group_by(Opening,my_color) %>%
mutate(opening_count_by_color = n())
#generate daily counts of game type
lichess_data <- lichess_data %>%
group_by(Date, Event) %>%
mutate(daily_type_count = n())
#create date-times and convert to local PST
lichess_data$date_time_UTC <- paste(lichess_data$UTCDate, lichess_data$UTCTime)
lichess_data$date_time_UTC <- as_datetime(lichess_data$date_time_UTC)
#now convert to PST
lichess_data$date_time_PST <- format(lichess_data$date_time_UTC,
usetz=TRUE,
tz = 'America/Los_Angeles')
#extract the hour value of each observation in PST
lichess_data$hour <- format(as.POSIXct(lichess_data$date_time_PST),
format = "%H")
Next, I’ll clean the opening names. Chess openings have lots of variations, but I’m not interested in the variations when looking at how many times I’ve played each opening (as some openings can have many dozens of variations). Let’s see how many variations I’ve encountered in my favorite opening: The Queen’s Gambit.
length(unique(lichess_data$Opening[grepl("Queen's Gambit",lichess_data$Opening,
fixed = TRUE)]))
## [1] 36
That’s 36 variations just in a single opening. For my analysis, I want all of the variations to be classified just as the base opening. Here I’ll standardize the opening names a little bit. The grepl() function takes arguments of the ‘needle’ we want to find (root of opening name), and the ‘haystack’ we want to find it in (the Openings column of the dataframe). We can use this to easily standardize the names of openings.
#Needles
opening_names <- c("Caro-Kann","Queen's Gambit","King's Indian",
"Nimzo-Indian Defense","Queen's Pawn Game",
"English Opening","Englund Gambit","French Defense",
"Sicilian Defense","Scandinavian Defense","Benoni Defense",
"Vienna Game","Semi-Slav","Slav Defense","Modern Defense",
"Dutch Defense","Budapest Defense")
for(i in opening_names){
#subset data frame and replace opening names with needle
lichess_data$Opening[grepl(i,lichess_data$Opening,
fixed = TRUE)] <- i
}
#generate total opening counts
lichess_data <- lichess_data %>%
group_by(Opening) %>%
mutate(opening_count = n())
I also want to do some analysis on specific types of moves in my games.
#split 'moves' into individual components
moves<- suppressWarnings(gsub("\\{.*?\\}", "", lichess_data$Moves,
perl = TRUE) %>% strsplit(., "\\s+"))
#most common first move
first_moves <- c()
for(i in moves){
first_moves <- append(first_moves,i[2])
}
#make df of first moves, my color
moves_df <- data.frame(first_moves, lichess_data$my_color)
colnames(moves_df) <- c("first_moves","my_color")
#count occurrences of first moves
moves_df <- moves_df %>%
group_by(first_moves) %>%
mutate(count_of_move = n())
#special moves
special_moves_list <- c("O-O-O", "O-O","#","+","=")
move_type <- c("Queenside Castle","Castle","Checkmate","Check","Promotion")
special_moves_count <- c()
num_temp <- c()
for(i in special_moves_list){
for(j in moves){
temp <- j[grepl(i,j,fixed = TRUE)]
num_temp <- append(num_temp, temp)
}
num_temp <- length(num_temp)
special_moves_count <- append(special_moves_count, num_temp)
}
#subtract queen side castles from castles, as grepl picks up O-O-O when looking for O-O
special_moves_count[2] <- special_moves_count[2] - special_moves_count[1]
#make data frame
special_df <- data.frame('move_type' = move_type,
'count' = special_moves_count)
##Move counts by piece type
piece_list <- c("K","Q","R","B","N")
piece_name <- c("King","Queen","Rook","Bishop","Knight")
piece_counts <- c()
num_temp <- c()
for(i in piece_list){
for(j in moves){
temp <- j[grepl(i,j,fixed = TRUE)]
num_temp <- append(num_temp, temp)
}
num_temp <- length(num_temp)
piece_counts <- append(piece_counts, num_temp)
}
#add castling/queenside to king counts!
piece_counts[1] <- piece_counts[1] + #'K' moves
special_moves_count[2] + #castle
special_moves_count[1] #queenside
#make df
piece_type_df <- data.frame(
'piece' = piece_name,
'move_count' = piece_counts)
Now that I’ve imported and cleaned my data, I’ll visualize some findings. I’ve organized these plots into categories for more cohesive insights.
Which time controls do I play the most?
lichess_data %>% group_by(TimeControl) %>%
count(TimeControl) %>%
ggplot(aes(x= reorder(TimeControl,n), y= n, fill = TimeControl)) +
geom_col(alpha = 1) +
geom_text(aes(label = n), hjust = -0.3) +
coord_flip() +
ylim(0,1380) +
labs(x= "Time Control", y= "Number of Games") +
ggtitle("Most Played Time Controls") +
theme_bw() +
theme(legend.position="none")
What is my win/loss/draw percentage in each time control?
#drop 10+0 because it's all casual games against low elo
ggplotly(
lichess_data[lichess_data$TimeControl %in%
c("1+0","2+1","3+0","3+2","5+0"),] %>%
group_by(TimeControl,my_result) %>%
count(TimeControl) %>%
ggplot(aes(fill = my_result, y = n, x = TimeControl))+
geom_bar(position = "fill",stat = "identity")+
labs(y = "Result %", x = NULL, fill = "My Result") +
ggtitle("Win/Draw/Loss Percentage by Time Control") +
theme_bw()) %>% config(displayModeBar = F)
My most played openings (limited to openings I’ve played 12 or more times).
ggplotly(
lichess_data[lichess_data$opening_count >= 12,] %>%
group_by(Opening,my_color,opening_count) %>%
count(Opening) %>%
ggplot(aes(x= n, y=reorder(Opening, opening_count), fill = my_color)) +
geom_col(position = "stack",stat = 'identity') +
labs(x= "Number of Games", y= NULL, fill = "My Color") +
ggtitle("Most Frequent Openings") +
scale_fill_manual(values=c("sienna4", "wheat1")) +
theme_bw()) %>% config(displayModeBar = F)
How do I perform in each opening?
ggplotly(
lichess_data[lichess_data$opening_count >= 12,] %>%
group_by(Opening,my_result,opening_count) %>%
count(Opening) %>%
ggplot(aes(fill = my_result, y = n, x = reorder(Opening,opening_count)))+
geom_bar(position = "fill",stat = "identity")+
labs(y = "Result %", x = NULL, fill = "My Result") +
coord_flip() +
ggtitle("Win/Draw/Loss by Opening") +
theme_bw()) %>% config(displayModeBar = F)
It seems that I perform well in my two most played openings (Caro Kann and Queen’s Gambit). However, I clearly need to rethink (or better study) The Vienna Game, as I’m losing 59% of my games in this opening, and it’s one I’m playing with the white pieces. Other notable insights are that I perform very strongly agaisnt the modern defense, and very poorly against the Budapest Defense.
Most common first move?
ggplotly(
moves_df %>% group_by(first_moves,my_color,count_of_move) %>%
count(first_moves) %>%
ggplot(aes(x= n, y=reorder(first_moves, count_of_move), fill = my_color)) +
geom_col(position = "stack",stat = 'identity') +
labs(x= "Number of Games", y= NULL, fill = "My Color") +
ggtitle("Most Frequent First Moves") +
scale_fill_manual(values=c("sienna4", "wheat1")) +
theme_bw()) %>% config(displayModeBar = F)
How do I perform in as the number of moves in a game increases?
ggplot(lichess_data, aes(x = num_moves, fill = my_result))+
geom_density(alpha = 0.7) + theme_bw() +
xlim(0,120) +
labs(x = "Number of Moves", fill = 'My Result') +
ggtitle("Performance by Move Count")
Interestingly, I perform better in games with a higher number of moves. This is likely due to the fact that if I make blunders, I resign early and move on to the next game. The distribution of draws in longer games also makes sense, as draws often come from endgames with high move counts.
How often do special move types occur?
#now plot
special_df %>%
ggplot(aes(x= reorder(move_type,-count), y= count, fill = move_type)) +
geom_bar(stat= 'identity', alpha = 1) +
geom_text(aes(label = count), vjust = -0.4) +
labs(y = "Count", x = NULL) +
ylim(0,7000) +
ggtitle("Counts of Special Moves") +
theme_bw() +
theme(legend.position="none")
How has my elo changed over time?
ggplotly(
lichess_data[lichess_data$Event %in%
c("Rated Blitz game","Rated Bullet game"),] %>%
ggplot(aes(x = Date, y = my_elo, color = Event)) +
geom_step() +
ylim(1600,2100) +
labs(y = "Elo", x = NULL) +
ggtitle("Elo Over Time") +
theme_bw()) %>% config(displayModeBar = F)
How do I perform against different elos?
ggplot(lichess_data, aes(x = opp_elo, fill = my_result))+
geom_density(alpha = 0.7) + theme_bw() +
xlim(1600,2100) +
xlab("Opponent Elo") +
ggtitle("Performance by Opponent Elo")
How many games am I playing per day?
ggplotly(
ggplot(data = lichess_data[lichess_data$Date >= "2021-7-1", ],
aes(x = Date, y = daily_type_count, color = Event)) +
geom_step() +
ylim(0,31) +
labs(y = "Number of Games", x = NULL) +
ggtitle("Daily Games Played")+
theme_bw()) %>% config(displayModeBar = F)
During which day(s) of the week do I play the most?
lichess_data %>% group_by(day_of_week) %>%
count(day_of_week) %>%
ggplot(aes(x = day_of_week, y = n, fill = day_of_week)) +
geom_bar(stat= 'identity', alpha = 1) +
geom_text(aes(label = n), vjust = -0.4) +
labs(x = NULL, y = "Games Played") +
ggtitle("Total Games per Weekday") +
theme_bw() +
theme(legend.position="none")
When during the day do I usually play?
lichess_data %>% group_by(hour) %>%
count(hour) %>%
ggplot(aes(x = hour, y = n, fill = hour)) +
geom_bar(stat= 'identity', alpha = 1) +
geom_text(aes(label = n), vjust = -0.4) +
ylim(0,185) +
labs(x = NULL, y = "Games Played") +
ggtitle("Hourly Game Distribution") +
theme_bw() +
theme(legend.position="none")
Which pieces get moved the most?
piece_type_df %>%
ggplot(aes(x= reorder(piece,-move_count), y= move_count, fill = piece)) +
geom_bar(stat= 'identity', alpha = 1) +
geom_text(aes(label = move_count), vjust = -0.4) +
ylim(0,19000) +
labs(y = "Move Count", x = NULL) +
theme_bw() +
theme(legend.position="none")
These orders make sense. Firstly, each player has two of the Knight, Bishop and Rook, os it makes sense that these are moved more than the King and Queen (of which each player only has one). It also makes sense that the knight is the most moved. As a short range piece, the knight needs a lot of moves and a lot of re-routes to get to the action. The Bishops and Rooks are much ‘faster’ pieces which do not have limits on how many squares they can travel each turn, meaning they don’t need as many moves to travel to the action.
For this section, I’ll actually briefly return to some data wrangling. I want to generate counts for the number of times a piece moves to each square on the board. This can be accomplished with the following loop:
values <- c() #for storing counts
files <- c("a","b","c","d","e","f","g","h")
ranks <- c("1","2","3","4","5","6","7","8")
file_list <- c() #for storing each iteration file
rank_list <- c() #for storing each iteration rank
#loop through each file,rank (to get a square)
for(i in files){
for(j in ranks){
#we now have a selected square with coordinates i,j
count_temp <- c()
for(k in moves){
temp <- k[grepl(paste(i,j,sep = ""),k,fixed = TRUE)]
count_temp <- append(count_temp,temp)
}
count_temp <- length(count_temp)
values <- append(values, count_temp) #store counts
file_list <- append(file_list, i) #append file
rank_list <- append(rank_list, j) #append rank
}
}
#combine into dataframe
grid <- data.frame(file = file_list,
rank = as.numeric(rank_list),
count = values)
#add castling and queenside castling to g1, g8, c1 and c8
#I will assume equal castling counts between colors
grid$count[grid$file == 'g' & grid$rank == 1] <-
floor(grid$count[grid$file == 'g' & grid$rank == 1] +
0.5*(special_moves_count[2]))
grid$count[grid$file == 'g' & grid$rank == 8] <-
floor(grid$count[grid$file == 'g' & grid$rank == 8] +
0.5*(special_moves_count[2]))
grid$count[grid$file == 'c' & grid$rank == 1] <-
floor(grid$count[grid$file == 'c' & grid$rank == 1] +
0.5*(special_moves_count[1]))
grid$count[grid$file == 'c' & grid$rank == 8] <-
floor(grid$count[grid$file == 'c' & grid$rank == 8] +
0.5*(special_moves_count[1]))
Now that I have a dataframe of counts for each square, I’ll create a heatmap.
ggplot(grid, aes(x = file, y = rank, fill = count)) +
geom_tile(color = "white",
lwd = 0.5,
linetype = 1) +
geom_text(aes(label = paste(file,rank, sep = '')),
color = "white", size = 2.5, vjust = -1) +
geom_text(aes(label = count), color = "white", size = 2.5, vjust = 1) +
coord_fixed() +
scale_fill_gradient(low = 'blue4',
high = 'red1') +
guides(fill = guide_colourbar(barwidth = 0.5,
barheight = 10)) +
ggtitle("Number of Moves to Each Square") +
theme_void() +
theme(plot.title = element_text(hjust = 0.5))
Most of the action is happening in the center, as it should! The next most popular squares are f3, c3, f6, and c6, where the knights are usually developed to.